In [1]:
import numpy as np
import pandas as pd
import os
import pickle
In [2]:
DATA_DIRECTORY = "Q:\\p_eaglesense\\eaglesense\\data\\topviewkinect"
PREPROCESSED_DIRECTORY = DATA_DIRECTORY + "\\all"
if not os.path.exists(PREPROCESSED_DIRECTORY):
    os.makedirs(PREPROCESSED_DIRECTORY)
In [3]:
FEATURE_SET = "test"
In [4]:
for dataset_id in next(os.walk(DATA_DIRECTORY))[1]:
    if not dataset_id.isdigit():
        continue
    elif dataset_id.startswith(("1", "3", "4", "6", "7")):
        continue
    features_csv = "{root}/{dataset}/features.csv".format(root=DATA_DIRECTORY, dataset=dataset_id)
    features_df = pd.read_csv(features_csv)
    labels_csv = "{root}/{dataset}/labels.csv".format(root=DATA_DIRECTORY, dataset=dataset_id)
    labels_df = pd.read_csv(labels_csv)
    if -1 in labels_df["activity"].values:
        print(dataset_id, "Missing labels")
    if 1 in labels_df["skeleton_id"].values:
        print(dataset_id, "Multiple people labels")
    if 1 in features_df["skeleton_id"].values:
        print(dataset_id, "Multiple people features")
In [21]:
num_empty_labels = 0
num_nonempty_labels = 0
num_nonempty_detected = 0
num_nonempty_tracked = 0
In [22]:
features_time = list()
total_time = list()
In [23]:
for dataset_id in next(os.walk(DATA_DIRECTORY))[1]:
    if not dataset_id.isdigit():
        continue
    elif dataset_id.startswith(("1", "3", "4", "6", "7")):
        continue
    features_csv = "{root}/{dataset}/features.csv".format(root=DATA_DIRECTORY, dataset=dataset_id)
    features_df = pd.read_csv(features_csv)
    labels_csv = "{root}/{dataset}/labels.csv".format(root=DATA_DIRECTORY, dataset=dataset_id)
    labels_df = pd.read_csv(labels_csv)
    nonempty_labels_df = labels_df.loc[labels_df["activity"] != 6]
    nonempty_detected_labels_df = nonempty_labels_df.loc[nonempty_labels_df["skeleton_id"] >= 0]
    nonempty_detected_frame_indices = nonempty_detected_labels_df["frame_id"].values
    # activity tracked
    activity_tracked_features_df = features_df.loc[features_df["frame_id"].isin(nonempty_detected_frame_indices)]
    num_empty_labels += len(labels_df) - len(nonempty_labels_df)
    num_nonempty_labels += len(nonempty_labels_df)
    num_nonempty_detected += len(nonempty_detected_labels_df)
    num_nonempty_tracked += len(activity_tracked_features_df)
    # processing time
    processing_csv = "{root}/{dataset}/processing.csv".format(root=DATA_DIRECTORY, dataset=dataset_id)
    processing_df = pd.read_csv(processing_csv)
    processing_df = processing_df.loc[processing_df["frame_id"].isin(nonempty_detected_frame_indices)]
    features_time.extend(processing_df["features_time"].values)
    total_time.extend(processing_df["total_time"].values)
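The mean and standard deviation computed cell by cell below can also be summarized in one step. A minimal sketch, not part of the original pipeline, assuming both lists stay aligned (they are extended from the same filtered rows):

# Hypothetical convenience check: summarize both timing lists at once.
timing_summary = pd.DataFrame({"features_time": features_time,
                               "total_time": total_time}).describe()
print(timing_summary)  # count, mean, std, quartiles for both columns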
In [28]:
avg_features_time = np.mean(features_time)
avg_features_time
Out[28]:
In [29]:
std_features_time = np.std(features_time)
std_features_time
Out[29]:
In [30]:
avg_total_time = np.mean(total_time)
avg_total_time
Out[30]:
In [31]:
std_total_time = np.std(total_time)
std_total_time
Out[31]:
In [37]:
num_empty_labels
Out[37]:
In [38]:
num_nonempty_labels
Out[38]:
In [39]:
total_frames = num_empty_labels + num_nonempty_labels
total_frames
Out[39]:
In [40]:
num_nonempty_labels / total_frames
Out[40]:
In [41]:
people_detection_accuracy = num_nonempty_detected / num_nonempty_labels
people_detection_accuracy
Out[41]:
In [20]:
num_nonempty_detected
Out[20]:
In [43]:
num_nonempty_tracked
Out[43]:
In [44]:
num_nonempty_tracked / total_frames
Out[44]:
In [45]:
# average per tracked frame; features_time is a plain list, so sum it first
np.sum(features_time) / num_nonempty_tracked
Out[45]:
In [46]:
# average per tracked frame; total_time is a plain list, so sum it first
np.sum(total_time) / num_nonempty_tracked
Out[46]:
In [7]:
ignored_features_columns = ["frame_id", "skeleton_id", "x", "y", "z"]
ignored_features_columns
Out[7]:
In [8]:
ignored_labels_columns = ["frame_id", "skeleton_id"]
In [ ]:
all_features_csv = "{root}/{tag}_features.csv".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
all_labels_csv = "{root}/{tag}_labels.csv".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
open(all_features_csv, "w").close()
open(all_labels_csv, "w").close()
header = True
with open(all_features_csv, "a") as features_f:
    with open(all_labels_csv, "a") as labels_f:
        for dataset_id in next(os.walk(DATA_DIRECTORY))[1]:
            if not dataset_id.isdigit():
                continue
            elif dataset_id.startswith(("1", "3", "4", "6", "7")):
                continue
            else:
                print(dataset_id, "... ", end="")
            labels_csv = "{root}/{dataset}/labels.csv".format(root=DATA_DIRECTORY, dataset=dataset_id)
            labels_df = pd.read_csv(labels_csv)
            features_csv = "{root}/{dataset}/features.csv".format(root=DATA_DIRECTORY, dataset=dataset_id)
            features_df = pd.read_csv(features_csv, low_memory=False)
            evaluation_labels_df = labels_df.loc[labels_df["skeleton_id"] >= 0]
            evaluation_labels_df = evaluation_labels_df.loc[evaluation_labels_df["activity"] != 6]
            # evaluation_labels_df = evaluation_labels_df.loc[evaluation_labels_df["skeleton_id"] == 0]
            evaluation_frame_indices = evaluation_labels_df["frame_id"].values
            # activity tracked
            evaluation_features_df = features_df.loc[features_df["frame_id"].isin(evaluation_frame_indices)]
            # evaluation_features_df = evaluation_features_df[evaluation_features_df["skeleton_id"] == 0]
            final_frame_indices = evaluation_features_df["frame_id"].values
            evaluation_labels_df = evaluation_labels_df.loc[evaluation_labels_df["frame_id"].isin(final_frame_indices)]
            evaluation_labels_df = evaluation_labels_df.drop(labels=ignored_labels_columns, axis=1)
            evaluation_labels_df["subject"] = int(dataset_id)
            evaluation_labels_df.to_csv(labels_f, header=header, index=False)
            evaluation_features_df = evaluation_features_df.drop(labels=ignored_features_columns, axis=1)
            evaluation_features_df["subject"] = int(dataset_id)
            evaluation_features_df = evaluation_features_df.astype("float64")
            evaluation_features_df.to_csv(features_f, header=header, index=False)
            header = False
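Before moving on, it is worth confirming that the appended features and labels files line up row for row. A minimal sketch, assuming one tracked skeleton per frame in the included datasets (which the integrity checks above suggest):

# Hypothetical sanity check: one label row per feature row.
features_check_df = pd.read_csv(all_features_csv)
labels_check_df = pd.read_csv(all_labels_csv)
assert len(features_check_df) == len(labels_check_df)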
In [9]:
all_features_df = pd.read_csv(all_features_csv)
In [10]:
all_features_df.shape
Out[10]:
In [11]:
all_features_df.head()
Out[11]:
In [12]:
all_labels_df = pd.read_csv(all_labels_csv)
In [13]:
all_labels_df.shape
Out[13]:
In [14]:
all_labels_df.head()
Out[14]:
In [24]:
np.unique(all_labels_df["activity"])
Out[24]:
In [25]:
ACTIVITIES = ["Standing", "Sitting", "Pointing", "Phone", "Tablet", "Paper"]
num_activities = len(ACTIVITIES)
num_activities
Out[25]:
In [26]:
unique_subjects = all_features_df["subject"].unique()
unique_subjects
Out[26]:
In [27]:
num_subjects = len(unique_subjects)
num_subjects
Out[27]:
In [28]:
feature_vector = all_features_df.drop(["subject"], axis=1)
num_features = feature_vector.shape[1]
num_features
Out[28]:
In [29]:
s1_data_path = "{root}/{tag}_s1_data.pickle".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
s2_data_path = "{root}/{tag}_s2_data.pickle".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
cs_data_path = "{root}/{tag}_cs_data.pickle".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
noinfrared_data_path = "{root}/{tag}_cs_noinfrared_data.pickle".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
In [30]:
def samples_test_split(features_df, labels_df, train_test_ratio, seed):
    # Get training sizes (minlength keeps the bin count fixed even if an
    # activity is absent for a subject)
    training_sizes_per_subject = np.zeros((num_subjects, num_activities), dtype=np.int64)
    for subject_idx, subject_id in enumerate(unique_subjects):
        subject_activities = labels_df[labels_df["subject"] == subject_id]["activity"].values
        subject_activities_bin = np.bincount(np.squeeze(subject_activities), minlength=num_activities)
        training_sizes_per_subject[subject_idx] = np.array([int(size * train_test_ratio) for size in subject_activities_bin])
    # Get training and testing data
    X_train = np.array([], dtype=np.float64).reshape(0, num_features)
    y_train = np.array([], dtype=np.int32).reshape(0, 1)
    X_test = np.array([], dtype=np.float64).reshape(0, num_features)
    y_test = np.array([], dtype=np.int32).reshape(0, 1)
    # Stratified sampling
    for subject_idx, subject_id in enumerate(unique_subjects):
        subject_features = features_df[features_df["subject"] == subject_id]
        subject_features = subject_features.drop(["subject"], axis=1)
        subject_labels = labels_df[labels_df["subject"] == subject_id]
        subject_labels = subject_labels[["activity"]]
        for activity_idx in range(num_activities):
            num_activity_samples = training_sizes_per_subject[subject_idx, activity_idx]
            activity_labels_df = subject_labels[subject_labels["activity"] == activity_idx]
            activity_train_labels_df = activity_labels_df.sample(n=num_activity_samples, replace=False, random_state=seed)
            activity_all_indices = list(activity_labels_df.index.values)
            activity_train_indices = list(activity_train_labels_df.index.values)
            activity_test_indices = [idx for idx in activity_all_indices if idx not in activity_train_indices]
            # .loc replaces the deprecated .ix indexer
            activity_X_train = subject_features.loc[activity_train_indices]
            activity_y_train = subject_labels.loc[activity_train_indices]
            activity_X_test = subject_features.loc[activity_test_indices]
            activity_y_test = subject_labels.loc[activity_test_indices]
            X_train = np.vstack([X_train, activity_X_train.values])
            y_train = np.vstack([y_train, activity_y_train.values])
            X_test = np.vstack([X_test, activity_X_test.values])
            y_test = np.vstack([y_test, activity_y_test.values])
    return X_train, y_train, X_test, y_test
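A quick way to verify that the stratified split behaves as intended is to check the realized train fraction. A minimal sketch, not part of the original pipeline; the variable names are illustrative:

# Hypothetical check: with a 1/3 ratio, roughly a third of all samples
# should land in the training set.
X_tr, y_tr, X_te, y_te = samples_test_split(all_features_df, all_labels_df, 1/3, seed=0)
train_fraction = X_tr.shape[0] / (X_tr.shape[0] + X_te.shape[0])
print(train_fraction)  # expect a value close to 1/3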
In [31]:
s1_X_train, s1_y_train, s1_X_test, s1_y_test = samples_test_split(all_features_df, all_labels_df, 1/3, seed=42)
In [32]:
s1_X_train.shape
Out[32]:
In [33]:
s1_X_test.shape
Out[33]:
In [34]:
s1_data = {
    "X_train": s1_X_train,
    "y_train": s1_y_train,
    "X_test": s1_X_test,
    "y_test": s1_y_test
}
with open(s1_data_path, "wb") as f:
    pickle.dump(s1_data, f)
In [35]:
s2_X_train, s2_y_train, s2_X_test, s2_y_test = samples_test_split(all_features_df, all_labels_df, 2/3, seed=42)
In [36]:
s2_X_train.shape
Out[36]:
In [37]:
s2_X_test.shape
Out[37]:
In [38]:
s2_data = {
    "X_train": s2_X_train,
    "y_train": s2_y_train,
    "X_test": s2_X_test,
    "y_test": s2_y_test
}
with open(s2_data_path, "wb") as f:
    pickle.dump(s2_data, f)
In [39]:
def crosssubject_test_split(features_df, labels_df, training_subjects_ids):
    num_features = features_df.shape[1] - 1
    X_train = np.array([], dtype=np.float64).reshape(0, num_features)
    y_train = np.array([], dtype=np.int32).reshape(0, 1)
    X_test = np.array([], dtype=np.float64).reshape(0, num_features)
    y_test = np.array([], dtype=np.int32).reshape(0, 1)
    for subject_id in unique_subjects:
        subject_features = features_df[features_df["subject"] == subject_id]
        subject_features = subject_features.drop(["subject"], axis=1)
        subject_labels = labels_df[labels_df["subject"] == subject_id]
        subject_labels = subject_labels[["activity"]]
        subject_X = subject_features.values
        subject_y = subject_labels.values
        if subject_id in training_subjects_ids:
            X_train = np.vstack([X_train, subject_X])
            y_train = np.vstack([y_train, subject_y])
        else:
            X_test = np.vstack([X_test, subject_X])
            y_test = np.vstack([y_test, subject_y])
    return X_train, y_train, X_test, y_test
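Since this split assigns whole subjects to one side or the other, every row should land in exactly one split. A minimal sketch with an illustrative single-subject training set:

# Hypothetical check: the two splits partition the data.
X_tr, y_tr, X_te, y_te = crosssubject_test_split(all_features_df, all_labels_df, [2001])
assert X_tr.shape[0] + X_te.shape[0] == len(all_features_df)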
In [40]:
CS_TRAIN_SUBJECTS_ID = [2001, 2003, 2005, 2007, 2009, 2011]
In [41]:
cs_X_train, cs_y_train, cs_X_test, cs_y_test = crosssubject_test_split(all_features_df, all_labels_df, CS_TRAIN_SUBJECTS_ID)
In [42]:
cs_X_train.shape
Out[42]:
In [43]:
cs_X_test.shape
Out[43]:
In [44]:
cs_data = {
    "X_train": cs_X_train,
    "y_train": cs_y_train,
    "X_test": cs_X_test,
    "y_test": cs_y_test
}
with open(cs_data_path, "wb") as f:
    pickle.dump(cs_data, f)
In [45]:
noinfrared_features_cols = list()
for feature in all_features_df.columns:
    if feature.startswith("extreme_infrared_"):
        continue
    else:
        noinfrared_features_cols.append(feature)
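The same filter can be written as a single list comprehension; a stylistic alternative, not a change to the logic above:

# Equivalent one-liner for the loop above.
noinfrared_features_cols = [col for col in all_features_df.columns
                            if not col.startswith("extreme_infrared_")]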
In [46]:
noinfrared_features_df = all_features_df[noinfrared_features_cols]
In [47]:
num_noinfrared_features = noinfrared_features_df.shape[1] - 1
num_noinfrared_features
Out[47]:
In [48]:
noinfrared_X_train, noinfrared_y_train, noinfrared_X_test, noinfrared_y_test = crosssubject_test_split(
    noinfrared_features_df, all_labels_df, CS_TRAIN_SUBJECTS_ID)
In [49]:
noinfrared_X_train.shape
Out[49]:
In [50]:
noinfrared_X_test.shape
Out[50]:
In [51]:
noinfrared_cs_data = {
    "X_train": noinfrared_X_train,
    "y_train": noinfrared_y_train,
    "X_test": noinfrared_X_test,
    "y_test": noinfrared_y_test
}
with open(noinfrared_data_path, "wb") as f:
    pickle.dump(noinfrared_cs_data, f)
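As a final round-trip check, one of the pickled splits can be reloaded to confirm the arrays survived serialization intact. A minimal sketch, not part of the original notebook:

# Hypothetical round-trip check on the cross-subject split.
with open(cs_data_path, "rb") as f:
    reloaded = pickle.load(f)
assert reloaded["X_train"].shape == cs_X_train.shape  # shapes preserved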